started: Alexey Larionov, Feb2017
last updated: Alexey Larionov, 05May2017
Select 93 genes:
83 genes:
- skat_M burden_svt_p <= 0.05
- aggregated exac-NFE AF <= 0.05
- crude trend call = “risk”
10 genes w/o exac data:
- skat_M burden_svt_p <= 0.05
Explore selected genes
Save selected genes to text file
Plot trend for each selected gene
# Start time
Sys.time()
## [1] "2017-05-05 22:10:52 BST"
# Folders
library(knitr)
base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_stat_05.17"
opts_knit$set(root.dir = base_folder)
#setwd(base_folder)
load(paste(base_folder, "results", "r09b_SKAT_M_wecare_only.RData", sep="/"))
#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_stat_05.17"
load(paste(base_folder, "results", "r10b_invert_aggregate_kgen_exac.RData", sep="/"))
#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_stat_05.17"
ls()
## [1] "base_folder" "exac.df"
## [3] "genes_aggr_data.mx" "genes_aggr_exac.df"
## [5] "genes_aggr_info.df" "genes_aggr_kgen.df"
## [7] "genes_aggr_skat_M.df" "genotypes_inv_imp_wt.mx"
## [9] "genotypes.mx" "kgen.df"
## [11] "phenotypes.df" "variants.df"
dim(genotypes.mx)
## [1] 18551 478
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000026 0 NA 0 0 0
## Var000000029 0 0 0 0 0
## Var000000031 0 0 0 0 0
## Var000000053 0 0 0 0 NA
## Var000000064 0 0 0 0 NA
dim(genotypes_inv_imp_wt.mx)
## [1] 18551 478
class(genotypes_inv_imp_wt.mx)
## [1] "matrix"
genotypes_inv_imp_wt.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000026 0 0.05482456 0 0 0.00000000
## Var000000029 0 0.00000000 0 0 0.00000000
## Var000000031 0 0.00000000 0 0 0.00000000
## Var000000053 0 0.00000000 0 0 0.05707763
## Var000000064 0 0.00000000 0 0 0.05518764
dim(genes_aggr_data.mx)
## [1] 9117 478
class(genes_aggr_data.mx)
## [1] "matrix"
genes_aggr_data.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## NOC2L 0 0.05482456 0.00000000 0.000000 0.0000000
## KLHL17 0 0.00000000 0.00000000 0.000000 0.1122653
## AGRN 0 0.65431621 0.00000000 8.602334 0.0000000
## TTLL10 0 0.00000000 0.00000000 0.000000 0.0000000
## PUSL1 0 0.05821459 0.05821459 0.000000 0.0000000
dim(genes_aggr_skat_M.df)
## [1] 9109 12
str(genes_aggr_skat_M.df)
## 'data.frame': 9109 obs. of 12 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : int 3 2 3 1 1 1 1 1 2 1 ...
## $ svt_p : num NA NA NA 0.346 0.711 ...
## $ svt_is_accurate : logi NA NA NA TRUE TRUE TRUE ...
## $ svt_map : num NA NA NA 0.0912 0.2106 ...
## $ burden_p : num 0.533 0.343 0.431 NA NA ...
## $ burden_is_accurate: logi TRUE TRUE TRUE NA NA NA ...
## $ burden_map : num 0.0135 0.0871 -1 NA NA ...
## $ skat_p : num 0.465 0.831 0.344 NA NA ...
## $ skat_is_accurate : logi TRUE TRUE TRUE NA NA NA ...
## $ skat_map : num 0.0118 0.0871 -1 NA NA ...
## $ aggregated_MAC : num 4 2 36 2 1 1 1 41 3 1 ...
genes_aggr_skat_M.df[1:5,1:5]
## gene num_var svt_p svt_is_accurate svt_map
## NOC2L NOC2L 3 NA NA NA
## KLHL17 KLHL17 2 NA NA NA
## AGRN AGRN 3 NA NA NA
## TTLL10 TTLL10 1 0.3459301 TRUE 0.09120194
## PUSL1 PUSL1 1 0.7106340 TRUE 0.21063399
dim(genes_aggr_info.df)
## [1] 9117 27
str(genes_aggr_info.df)
## 'data.frame': 9117 obs. of 27 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : num 3 2 3 1 1 1 1 1 2 1 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ aggr_ac : num 4 2 36 2 1 1 1 41 3 1 ...
## $ aggr_an : num 2808 1782 2792 956 810 ...
## $ aggr_af : num 0.00142 0.00112 0.01289 0.00209 0.00123 ...
## $ aggr_ac_cbc : num 3 2 15 2 0 1 1 20 2 0 ...
## $ aggr_an_cbc : num 1358 870 1360 466 400 ...
## $ aggr_af_cbc : num 0.00221 0.0023 0.01103 0.00429 0 ...
## $ aggr_ac_ubc : num 1 0 21 0 1 0 0 21 1 1 ...
## $ aggr_an_ubc : num 1450 912 1432 490 410 ...
## $ aggr_af_ubc : num 0.00069 0 0.01466 0 0.00244 ...
## $ aggr_ac_cbc_fam : num 1 0 5 2 0 0 1 9 0 0 ...
## $ aggr_an_cbc_fam : num 482 306 478 166 144 162 164 164 318 150 ...
## $ aggr_af_cbc_fam : num 0.00207 0 0.01046 0.01205 0 ...
## $ aggr_ac_cbc_nofam: num 2 2 10 0 0 1 0 11 2 0 ...
## $ aggr_an_cbc_nofam: num 876 564 882 300 256 298 296 300 566 290 ...
## $ aggr_af_cbc_nofam: num 0.00228 0.00355 0.01134 0 0 ...
## $ aggr_ac_ubc_fam : num 0 0 4 0 1 0 0 8 0 1 ...
## $ aggr_an_ubc_fam : num 418 262 414 142 120 140 142 142 278 132 ...
## $ aggr_af_ubc_fam : num 0 0 0.00966 0 0.00833 ...
## $ aggr_ac_ubc_nofam: num 1 0 17 0 0 0 0 13 1 0 ...
## $ aggr_an_ubc_nofam: num 1032 650 1018 348 290 ...
## $ aggr_af_ubc_nofam: num 0.000969 0 0.016699 0 0 ...
## $ cbc_ubc_call : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
## $ cbc_ubc_fisher_p : num 0.359 0.238 0.408 0.237 1 ...
genes_aggr_info.df[1:5,1:5]
## gene num_var inverted multiallelic aggr_ac
## NOC2L NOC2L 3 FALSE FALSE 4
## KLHL17 KLHL17 2 FALSE FALSE 2
## AGRN AGRN 3 FALSE FALSE 36
## TTLL10 TTLL10 1 FALSE FALSE 2
## PUSL1 PUSL1 1 FALSE FALSE 1
dim(genes_aggr_kgen.df)
## [1] 9117 16
str(genes_aggr_kgen.df)
## 'data.frame': 9117 obs. of 16 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : num 3 2 3 1 1 1 1 1 2 1 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_kgen : num 2 NA 92 NA 6 NA 4 371 9 NA ...
## $ an_kgen : num 10016 NA 5008 NA 5008 ...
## $ af_kgen : num 0.0002 NA 0.0184 NA 0.0012 ...
## $ ac_ubc : num 1 0 21 0 1 0 0 21 1 1 ...
## $ an_ubc : num 1450 912 1432 490 410 ...
## $ af_ubc : num 0.00069 0 0.01466 0 0.00244 ...
## $ ac_cbc : num 3 2 15 2 0 1 1 20 2 0 ...
## $ an_cbc : num 1358 870 1360 466 400 ...
## $ af_cbc : num 0.00221 0.0023 0.01103 0.00429 0 ...
## $ pearson_r : num 0.959 NA -1 NA -0.491 ...
## $ trend_call : Factor w/ 3 levels "protective","risk",..: 2 NA 1 NA 1 NA 2 1 2 NA ...
## $ crude_trend_test_p: num 0.00168 NA 0.04823 NA 0.75384 ...
genes_aggr_kgen.df[1:5,1:5]
## gene num_var inverted multiallelic ac_kgen
## NOC2L NOC2L 3 FALSE FALSE 2
## KLHL17 KLHL17 2 FALSE FALSE NA
## AGRN AGRN 3 FALSE FALSE 92
## TTLL10 TTLL10 1 FALSE FALSE NA
## PUSL1 PUSL1 1 FALSE FALSE 6
dim(genes_aggr_exac.df)
## [1] 9117 16
str(genes_aggr_exac.df)
## 'data.frame': 9117 obs. of 16 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : num 3 2 3 1 1 1 1 1 2 1 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_exac_NFE : num 14 2 2193 1 32 ...
## $ an_exac_NFE : num 108598 108618 74336 4760 54314 ...
## $ af_exac_NFE : num 1.29e-04 1.84e-05 2.95e-02 2.10e-04 5.89e-04 ...
## $ ac_ubc : num 1 0 21 0 1 0 0 21 1 1 ...
## $ an_ubc : num 1450 912 1432 490 410 ...
## $ af_ubc : num 0.00069 0 0.01466 0 0.00244 ...
## $ ac_cbc : num 3 2 15 2 0 1 1 20 2 0 ...
## $ an_cbc : num 1358 870 1360 466 400 ...
## $ af_cbc : num 0.00221 0.0023 0.01103 0.00429 0 ...
## $ pearson_r : num 0.966 0.863 -0.944 0.844 -0.231 ...
## $ trend_call : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
## $ crude_trend_test_p: num 1.28e-09 6.31e-23 4.40e-07 1.49e-03 8.00e-01 ...
genes_aggr_exac.df[1:5,1:5]
## gene num_var inverted multiallelic ac_exac_NFE
## NOC2L NOC2L 3 FALSE FALSE 14
## KLHL17 KLHL17 2 FALSE FALSE 2
## AGRN AGRN 3 FALSE FALSE 2193
## TTLL10 TTLL10 1 FALSE FALSE 1
## PUSL1 PUSL1 1 FALSE FALSE 32
dim(kgen.df)
## [1] 18551 9
colnames(kgen.df)
## [1] "SplitVarID" "kgen.AC" "kgen.AN" "kgen.AF" "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
## SplitVarID kgen.AC kgen.AN kgen.AF kgen.AFR_AF
## Var000000026 Var000000026 NA NA NA NA
## Var000000029 Var000000029 1 5008 0.000199681 0
## Var000000031 Var000000031 1 5008 0.000199681 0
## Var000000053 Var000000053 NA NA NA NA
## Var000000064 Var000000064 NA NA NA NA
dim(exac.df)
## [1] 18551 48
colnames(exac.df)
## [1] "SplitVarID" "exac_non_TCGA.AF"
## [3] "exac_non_TCGA.AC" "exac_non_TCGA.AN"
## [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
## [7] "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AN_MALE"
## [9] "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AN_Adj"
## [11] "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_Het"
## [13] "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_AFR"
## [15] "exac_non_TCGA.AN_AFR" "exac_non_TCGA.Hom_AFR"
## [17] "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Hemi_AFR"
## [19] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AN_AMR"
## [21] "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Het_AMR"
## [23] "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.AC_EAS"
## [25] "exac_non_TCGA.AN_EAS" "exac_non_TCGA.Hom_EAS"
## [27] "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Hemi_EAS"
## [29] "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AN_FIN"
## [31] "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Het_FIN"
## [33] "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.AC_NFE"
## [35] "exac_non_TCGA.AN_NFE" "exac_non_TCGA.Hom_NFE"
## [37] "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Hemi_NFE"
## [39] "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AN_SAS"
## [41] "exac_non_TCGA.Hom_SAS" "exac_non_TCGA.Het_SAS"
## [43] "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.AC_OTH"
## [45] "exac_non_TCGA.AN_OTH" "exac_non_TCGA.Hom_OTH"
## [47] "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
## SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000026 Var000000026 NA NA
## Var000000029 Var000000029 1.506e-04 16
## Var000000031 Var000000031 2.354e-04 25
## Var000000053 Var000000053 1.883e-05 2
## Var000000064 Var000000064 9.416e-06 1
## exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000026 NA NA
## Var000000029 106210 8
## Var000000031 106210 4
## Var000000053 106210 0
## Var000000064 106206 0
dim(variants.df)
## [1] 18551 67
str(variants.df)
## 'data.frame': 18551 obs. of 67 variables:
## $ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 26 29 31 53 64 167 227 237 300 400 ...
## $ SYMBOL : Factor w/ 19862 levels "A1BG","A1CF",..: 11149 11149 11149 8665 8665 540 540 540 18233 13545 ...
## $ TYPE : Factor w/ 2 levels "INDEL","SNP": 2 2 2 2 2 2 2 2 2 2 ...
## $ CHROM : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ POS : int 881871 883883 883946 897287 897792 979748 987173 989899 1132513 1246373 ...
## $ REF : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 1575 539 539 539 1 1575 539 539 1044 ...
## $ ALT : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 1 323 989 989 989 989 664 989 989 1 ...
## $ ac_raw : int 1 2 1 1 1 53 1 1 2 1 ...
## $ af_raw : num 0.000711 0.001555 0.000774 0.000711 0.000722 ...
## $ an_raw : int 1406 1286 1292 1406 1386 1416 1420 1414 1320 1400 ...
## $ Consequence : Factor w/ 78 levels "3_prime_UTR_variant",..: 66 28 28 28 66 28 28 28 28 28 ...
## $ SIFT_call : Factor w/ 4 levels "deleterious",..: NA 1 1 1 NA 1 1 1 1 1 ...
## $ SIFT_score : num NA 0 0.01 0.04 NA 0.01 0 0.02 0.02 0.05 ...
## $ PolyPhen_call : Factor w/ 4 levels "benign","possibly_damaging",..: NA 3 3 3 NA 3 3 3 3 3 ...
## $ PolyPhen_score : num NA 0.936 0.946 0.997 NA 0.932 0.999 0.991 0.999 0.988 ...
## $ CLIN_SIG : Factor w/ 120 levels "association",..: NA NA NA NA NA 2 NA NA NA NA ...
## $ cDNA_position : Factor w/ 16585 levels "1","?-1","10",..: 4302 3741 3476 13013 15367 5539 11506 11969 4210 15212 ...
## $ CDS_position : Factor w/ 15333 levels "1","?-1","10",..: 3744 3218 2977 10599 13061 4871 10484 10919 3279 13242 ...
## $ Codons : Factor w/ 3014 levels "-/A","-/AA","AAA/-",..: 1025 1858 1306 1172 1300 1864 2503 374 328 1877 ...
## $ Protein_position : Factor w/ 7360 levels "1","?-1","10",..: 6008 5707 5569 1723 2739 6606 1672 1845 5745 2811 ...
## $ Amino_acids : Factor w/ 1851 levels "-/*","*","A",..: 1171 334 1303 1134 1283 381 1809 1594 1578 225 ...
## $ Existing_variation: Factor w/ 309385 levels "","FANCA:c.2534T>C",..: NA 54731 225315 291348 274420 10329 268628 74468 292466 49490 ...
## $ Multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_all : int 1 2 1 1 1 34 1 1 2 1 ...
## $ an_all : num 912 942 954 876 906 894 954 944 956 810 ...
## $ af_all : num 0.0011 0.00212 0.00105 0.00114 0.0011 ...
## $ ac_ubc : int 0 0 1 0 0 20 0 1 0 1 ...
## $ an_ubc : num 472 488 490 450 462 460 488 484 490 410 ...
## $ af_ubc : num 0 0 0.00204 0 0 ...
## $ ac_cbc : int 1 2 0 1 1 14 1 0 2 0 ...
## $ an_cbc : num 440 454 464 426 444 434 466 460 466 400 ...
## $ af_cbc : num 0.00227 0.00441 0 0.00235 0.00225 ...
## $ ac_ubc_fam : int 0 0 0 0 0 4 0 0 0 1 ...
## $ an_ubc_fam : num 134 142 142 130 132 132 142 140 142 120 ...
## $ af_ubc_fam : num 0 0 0 0 0 ...
## $ ac_ubc_nofam : int 0 0 1 0 0 16 0 1 0 0 ...
## $ an_ubc_nofam : num 338 346 348 320 330 328 346 344 348 290 ...
## $ af_ubc_nofam : num 0 0 0.00287 0 0 ...
## $ ac_cbc_fam : int 0 1 0 0 0 5 0 0 2 0 ...
## $ an_cbc_fam : num 156 162 164 152 154 148 166 164 166 144 ...
## $ af_cbc_fam : num 0 0.00617 0 0 0 ...
## $ ac_cbc_nofam : int 1 1 0 1 1 9 1 0 0 0 ...
## $ an_cbc_nofam : num 284 292 300 274 290 286 300 296 300 256 ...
## $ af_cbc_nofam : num 0.00352 0.00342 0 0.00365 0.00345 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_inv : num 1 2 1 1 1 34 1 1 2 1 ...
## $ an_inv : num 912 942 954 876 906 894 954 944 956 810 ...
## $ af_inv : num 0.0011 0.00212 0.00105 0.00114 0.0011 ...
## $ ac_cbc_inv : num 1 2 0 1 1 14 1 0 2 0 ...
## $ an_cbc_inv : num 440 454 464 426 444 434 466 460 466 400 ...
## $ af_cbc_inv : num 0.00227 0.00441 0 0.00235 0.00225 ...
## $ ac_ubc_inv : num 0 0 1 0 0 20 0 1 0 1 ...
## $ an_ubc_inv : num 472 488 490 450 462 460 488 484 490 410 ...
## $ af_ubc_inv : num 0 0 0.00204 0 0 ...
## $ ac_cbc_fam_inv : num 0 1 0 0 0 5 0 0 2 0 ...
## $ an_cbc_fam_inv : num 156 162 164 152 154 148 166 164 166 144 ...
## $ af_cbc_fam_inv : num 0 0.00617 0 0 0 ...
## $ ac_cbc_nofam_inv : num 1 1 0 1 1 9 1 0 0 0 ...
## $ an_cbc_nofam_inv : num 284 292 300 274 290 286 300 296 300 256 ...
## $ af_cbc_nofam_inv : num 0.00352 0.00342 0 0.00365 0.00345 ...
## $ ac_ubc_fam_inv : num 0 0 0 0 0 4 0 0 0 1 ...
## $ an_ubc_fam_inv : num 134 142 142 130 132 132 142 140 142 120 ...
## $ af_ubc_fam_inv : num 0 0 0 0 0 ...
## $ ac_ubc_nofam_inv : num 0 0 1 0 0 16 0 1 0 0 ...
## $ an_ubc_nofam_inv : num 338 346 348 320 330 328 346 344 348 290 ...
## $ af_ubc_nofam_inv : num 0 0 0.00287 0 0 ...
## $ weight : num 25 25 23.8 25 25 ...
variants.df[1:5,1:5]
## SplitVarID SYMBOL TYPE CHROM POS
## Var000000026 Var000000026 NOC2L SNP 1 881871
## Var000000029 Var000000029 NOC2L SNP 1 883883
## Var000000031 Var000000031 NOC2L SNP 1 883946
## Var000000053 Var000000053 KLHL17 SNP 1 897287
## Var000000064 Var000000064 KLHL17 SNP 1 897792
dim(phenotypes.df)
## [1] 478 36
str(phenotypes.df)
## 'data.frame': 478 obs. of 36 variables:
## $ wes_id : chr "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## $ gwas_id : chr "id203568" "id357807" "id325472" "id304354" ...
## $ merged_id : chr "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
## $ filter : chr "pass" "pass" "pass" "pass" ...
## $ cc : int 1 1 1 1 1 1 0 1 1 0 ...
## $ setno : int 203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
## $ registry : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
## $ family_history: int 1 0 1 1 1 1 1 1 1 0 ...
## $ age_dx : int 49 35 32 33 44 28 28 38 35 36 ...
## $ age_ref : num 58 36 41 34 49 28 32 44 35 38 ...
## $ rstime : num 10.17 1.83 9.75 1.59 5.66 ...
## $ eig1_gwas : num -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
## $ eig2_gwas : num 0.00266 0.00246 -0.0001 0.00595 0.01157 ...
## $ eig3_gwas : num 0.06803 0.05055 -0.00603 0.00747 0.00144 ...
## $ eig4_gwas : num -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
## $ eig5_gwas : num -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
## $ stage : num 1 2 2 1 1 1 2 1 2 1 ...
## $ er : num NA 0 0 0 NA 1 1 1 1 0 ...
## $ pr : num NA 0 0 NA NA 1 NA 1 0 0 ...
## $ hist_cat : chr "lobular" "ductal" "medullary" "ductal" ...
## $ hormone : num 0 0 0 0 1 0 0 0 0 0 ...
## $ chemo_cat : chr "no" "CMF" "Oth" "no" ...
## $ br_xray : int 1 1 0 0 1 0 0 0 1 1 ...
## $ br_xray_dose : num 1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
## $ age_menarche : num 9 13 10 12 10 13 12 11 11 NA ...
## $ age_1st_ftp : num 30 0 26 0 17 0 25 28 27 18 ...
## $ age_menopause : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ num_preg : num 1 0 1 0 1 0 1 1 1 1 ...
## $ bmi_age18 : num 20.8 22.5 23.6 18.6 19.9 ...
## $ bmi_dx : num 25.8 22.9 28.3 17.8 26.6 ...
## $ bmi_ref : num 33.3 22.9 33.1 17.8 29.8 ...
## $ eig1 : num -0.1566 -0.13939 -0.00225 -0.00914 0.01841 ...
## $ eig2 : num 0.08741 0.06847 -0.00239 0.0008 -0.02139 ...
## $ eig3 : num -0.0185 -0.0117 -0.0185 -0.0713 -0.0101 ...
## $ eig4 : num -0.03333 0.03332 0.01482 0.00719 -0.01141 ...
## $ eig5 : num 0.02116 0.09035 0.00663 0.01628 -0.02406 ...
phenotypes.df[1:5,1:5]
## wes_id gwas_id merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568 pass 1
## P1_A02 P1_A02 id357807 P1_A02_id357807 pass 1
## P1_A03 P1_A03 id325472 P1_A03_id325472 pass 1
## P1_A04 P1_A04 id304354 P1_A04_id304354 pass 1
## P1_A05 P1_A05 id222648 P1_A05_id222648 pass 1
# Check consistency of rownames and colnames
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(colnames(genes_aggr_data.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_data.mx))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_kgen.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_exac.df))
## [1] 0
# sum(rownames(genes_aggr_info.df) != rownames(Genes_aggr_skat_M.df))
# NB: - see comment before the next chunk
sum(rownames(genotypes.mx) != rownames(genotypes_inv_imp_wt.mx))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0
Keep only genes present in SKAT-M
Genes_aggr_skat_M.df has different number of rows(genes) than other genes_aggr files !!! This is OK: 8 genes failed SKAT because of low call rate ( = high NA rate).
Our filtering used 80% call rate, while default SKAT used 85% NA rate filter.
In future I may change our filter to 85%, for consistency.
# Get SKAT-M genes
skat_m_genes <- rownames(genes_aggr_skat_M.df)
# Sync
genes_aggr_info.df <- genes_aggr_info.df[skat_m_genes,]
genes_aggr_data.mx <- genes_aggr_data.mx[skat_m_genes,]
genes_aggr_kgen.df <- genes_aggr_kgen.df[skat_m_genes,]
genes_aggr_exac.df <- genes_aggr_exac.df[skat_m_genes,]
# Clean-up
rm(skat_m_genes)
# Check burden and svt p values
summary(genes_aggr_skat_M.df$svt_is_accurate)
## Mode TRUE NA's
## logical 4688 4421
summary(genes_aggr_skat_M.df$burden_is_accurate)
## Mode TRUE NA's
## logical 4421 4688
# Merge burden and svt p values
svt_p <- genes_aggr_skat_M.df$svt_p
burden_p <- genes_aggr_skat_M.df$burden_p
burden_svt_p <- ifelse(is.na(svt_p),burden_p,svt_p)
summary(burden_svt_p) # no NA
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0004635 0.2270000 0.5397000 0.4987000 0.7361000 1.0000000
# Make aggregated SKAT_M_exac table
genes_aggr_skat_M_exac.df <- cbind(
genes_aggr_skat_M.df[,c("gene", "num_var", "aggregated_MAC")],
burden_svt_p,
genes_aggr_exac.df[,c("inverted", "multiallelic",
"ac_exac_NFE", "an_exac_NFE", "af_exac_NFE",
"ac_ubc", "an_ubc", "af_ubc",
"ac_cbc", "an_cbc", "af_cbc",
"pearson_r", "trend_call", "crude_trend_test_p")])
# Clean-up
rm(svt_p, burden_p, burden_svt_p)
# Explore genes_aggr_skat_M.df
nrow(genes_aggr_skat_M_exac.df)
## [1] 9109
summary(genes_aggr_skat_M_exac.df$trend_call)
## protective risk uncertain NA's
## 2992 3980 103 2034
hist(genes_aggr_skat_M_exac.df$burden_svt_p,
breaks=20, ylim=c(0,2000), labels=TRUE,
xlab="p-values", ylab="genes count",
main="Histogram of SKAT-M burden p-values")
# Note that expected p-values are not uniform!
# (see Lee et al Biostatistics 2016)
hist(genes_aggr_skat_M_exac.df$af_exac_NFE,
breaks=20, ylim=c(0,6000), labels=TRUE,
xlab="aggregated exac AF", ylab="genes count",
main="Histogram of aggregated exac AFs")
# Genes with exac data:
# "Rare" (exac <=0.05), "risk" genes with burden_p <= 0.05
selected_genes_with_exac.df <- genes_aggr_skat_M_exac.df[!is.na(genes_aggr_skat_M_exac.df$af_exac_NFE),]
selected_genes_with_exac.df <- selected_genes_with_exac.df[selected_genes_with_exac.df$burden_svt_p <= 0.05,]
selected_genes_with_exac.df <- selected_genes_with_exac.df[selected_genes_with_exac.df$af_exac_NFE <= 0.05,]
selected_genes_with_exac.df <- selected_genes_with_exac.df[selected_genes_with_exac.df$trend_call=="risk",]
selected_genes_with_exac.df <- selected_genes_with_exac.df[order(selected_genes_with_exac.df$burden_svt_p),]
dim(selected_genes_with_exac.df)
## [1] 83 18
# Genes without exac data: burden_p <= 0.05
selected_genes_no_exac.df <- genes_aggr_skat_M_exac.df[is.na(genes_aggr_skat_M_exac.df$trend_call),]
selected_genes_no_exac.df <- selected_genes_no_exac.df[selected_genes_no_exac.df$burden_svt_p <= 0.05,]
selected_genes_no_exac.df <- selected_genes_no_exac.df[order(selected_genes_no_exac.df$burden_svt_p),]
dim(selected_genes_no_exac.df)
## [1] 10 18
# Merge selected genes with and w/o exac data
selected_genes.df <- rbind(selected_genes_with_exac.df, selected_genes_no_exac.df)
selected_genes.df <- selected_genes.df[order(selected_genes.df$burden_svt_p),]
dim(selected_genes.df)
## [1] 93 18
# Print selected genes
selected_genes.df[,c("num_var","burden_svt_p", "af_exac_NFE", "af_ubc", "af_cbc", "crude_trend_test_p")]
## num_var burden_svt_p af_exac_NFE af_ubc af_cbc
## FAM171B 4 0.001368650 2.957000e-04 0.0096982759 0.0005630631
## EPHB2 2 0.004438192 3.075506e-03 0.0000000000 0.0068027211
## FOXM1 6 0.005854754 3.119452e-03 0.0046001415 0.0092387288
## TMEM134 1 0.007589152 6.606951e-03 0.0000000000 0.0115740741
## IQCA1 2 0.008472387 1.185144e-03 0.0000000000 0.0054704595
## GALNT12 5 0.010213508 2.526592e-03 0.0008176615 0.0043066322
## TIMELESS 7 0.010587495 2.624044e-04 0.0700541191 0.0643627140
## DBI 3 0.012421156 1.374419e-03 0.0000000000 0.0036179450
## TMEM139 1 0.012506083 NA 0.0103305785 0.0000000000
## CCDC27 4 0.013233760 1.567629e-04 0.0000000000 0.0026910657
## MYO1D 3 0.013250152 7.745648e-04 0.0000000000 0.0029154519
## ERCC6 6 0.013369314 7.560670e-04 0.0003465003 0.0025399129
## TROAP 4 0.013405362 7.079363e-04 0.0000000000 0.0032715376
## ECT2L 11 0.013495734 6.792698e-04 0.0005668934 0.0021964856
## ECM2 3 0.013579767 1.523398e-03 0.0000000000 0.0036764706
## DUOX1 5 0.013916339 1.178985e-03 0.0000000000 0.0021872266
## OR1S2 1 0.014844676 NA 0.0000000000 0.0066079295
## SH3RF2 1 0.016153907 NA 0.0000000000 0.0085836910
## DPP4 2 0.017842165 1.925356e-02 0.0132450331 0.0300925926
## MDM1 2 0.018886855 1.079157e-03 0.0000000000 0.0065789474
## ECM1 3 0.018935200 5.759923e-03 0.0013605442 0.0086330935
## REV3L 6 0.018944101 6.782671e-05 0.0000000000 0.0022075055
## PDE5A 1 0.019038427 NA 0.0153508772 0.0023255814
## LRRIQ3 4 0.019133259 6.893556e-04 0.0032467532 0.0085812357
## SLC4A3 4 0.019420337 1.746420e-04 0.0000000000 0.0022831050
## NOC3L 6 0.019943641 4.128891e-03 0.0017241379 0.0051132213
## SLC4A4 2 0.020456081 1.455926e-03 0.0000000000 0.0048543689
## RP9 1 0.021397689 1.017962e-02 0.0000000000 0.0129310345
## MEF2A 3 0.023086387 1.249017e-03 0.0000000000 0.0030165913
## SLC28A2 4 0.023450965 2.762227e-04 0.0000000000 0.0027995521
## GBGT1 1 0.023850873 5.520592e-03 0.0000000000 0.0086580087
## C8A 2 0.024934306 5.741838e-03 0.0010638298 0.0088888889
## C6orf201 1 0.025197315 NA 0.0877551020 0.0536480687
## TMCO4 5 0.025604399 1.953389e-03 0.0004237288 0.0026833631
## ITIH6 4 0.025965010 2.645236e-04 0.0000000000 0.0022547914
## ADAMTSL4 3 0.025986563 8.010253e-05 0.0127840909 0.0060331825
## RNF135 1 0.026109094 NA 0.0040983607 0.0214592275
## SCN5A 4 0.026326627 4.609230e-05 0.0000000000 0.0021786492
## RDX 4 0.026345468 1.484616e-04 0.0000000000 0.0021621622
## SERPIND1 1 0.027559056 1.784860e-03 0.0000000000 0.0086206897
## HOXD13 4 0.027629598 6.908749e-04 0.0000000000 0.0021528525
## METTL20 2 0.027958442 3.943568e-03 0.0011013216 0.0067720090
## SLC9A3R1 5 0.029060352 1.773017e-03 0.0004201681 0.0030946065
## ENTPD3 4 0.030180084 6.692208e-04 0.0000000000 0.0022296544
## LRIT3 1 0.030592502 2.787953e-02 0.0247933884 0.0504385965
## MRPL51 3 0.031142574 8.409263e-04 0.0020833333 0.0072463768
## SCRN3 4 0.032209984 NA 0.0851619645 0.0762527233
## CENPJ 2 0.032560822 1.410450e-03 0.0000000000 0.0032537961
## DEPTOR 2 0.032590560 1.289301e-04 0.0000000000 0.0033860045
## C7orf31 2 0.033264599 6.808229e-04 0.0295358650 0.0156950673
## WDR63 1 0.033343146 NA 0.0509259259 0.0857843137
## KRT12 3 0.033837477 1.632062e-03 0.0000000000 0.0021582734
## IGKC 3 0.034368191 1.238896e-03 0.0000000000 0.0021645022
## ITGA9 2 0.035439759 4.603119e-05 0.0000000000 0.0032188841
## RALGAPB 2 0.036508480 1.390142e-03 0.0000000000 0.0033112583
## C19orf59 1 0.036803160 4.697198e-03 0.0000000000 0.0064935065
## ZNF44 4 0.037896075 2.875276e-03 0.0010214505 0.0043196544
## DXO 1 0.038363244 5.668016e-03 0.0000000000 0.0107296137
## TCF24 1 0.039173952 6.772009e-03 0.0020491803 0.0129870130
## GYG1 2 0.039456483 1.089003e-03 0.0000000000 0.0053879310
## PLEKHG3 4 0.039529472 6.124381e-04 0.0000000000 0.0026968716
## STK11IP 4 0.040193202 9.105448e-03 0.0067920585 0.0137969095
## FARS2 3 0.040703069 2.039613e-03 0.0000000000 0.0037147103
## OR52K1 2 0.041547109 1.509434e-03 0.0000000000 0.0055928412
## NRG1 2 0.041964650 1.841688e-05 0.0000000000 0.0034324943
## GALT 3 0.042044605 1.150039e-03 0.0000000000 0.0021551724
## CYP2A13 2 0.042085843 5.796835e-04 0.0000000000 0.0032188841
## SVOPL 5 0.042100979 2.654720e-03 0.0480295567 0.0520833333
## GPR87 2 0.042160022 1.108525e-04 0.0000000000 0.0033333333
## NF1 3 0.042363028 5.520389e-05 0.0000000000 0.0021645022
## PPAT 1 0.042745150 2.576561e-03 0.0000000000 0.0107758621
## P2RX2 1 0.042958560 2.177765e-03 0.0000000000 0.0066371681
## RAX2 2 0.043039110 1.266702e-02 0.0088495575 0.0214797136
## ARAP1 5 0.043136222 4.299701e-05 0.0020475020 0.0004347826
## S100Z 1 0.043139506 NA 0.0000000000 0.0065502183
## TIMM21 3 0.043252002 4.203315e-04 0.0000000000 0.0036023055
## SH2B2 2 0.043273371 7.375241e-03 0.0271149675 0.0417607223
## IQCE 2 0.043608698 1.167078e-02 0.0040816327 0.0118790497
## GPR133 2 0.043892368 1.662771e-02 0.0103092784 0.0231277533
## ACOX3 3 0.044599960 4.725005e-05 0.0000000000 0.0021645022
## DCLRE1B 2 0.044863978 3.680259e-05 0.0000000000 0.0021645022
## CACNA1H 5 0.044938450 1.301784e-03 0.0000000000 0.0031418312
## UNC5A 2 0.045014013 1.861339e-03 0.0000000000 0.0034642032
## EIF4ENIF1 3 0.045114687 NA 0.0000000000 0.0021520803
## CC2D1B 4 0.047255323 0.000000e+00 0.0000000000 0.0022446689
## APLP2 3 0.047439268 1.472890e-04 0.0000000000 0.0021645022
## ALG6 4 0.047926587 1.449723e-02 0.0098855359 0.0168478261
## FREM2 2 0.048533489 3.964961e-04 0.0000000000 0.0032467532
## OR4X2 3 0.048881780 3.681207e-05 0.0000000000 0.0022123894
## OR5D18 2 0.049182573 3.350022e-02 0.0224489796 0.0378787879
## MYO9B 4 0.049238885 8.759907e-04 0.0010351967 0.0033039648
## BIN3 2 0.049250431 3.702744e-04 0.0000000000 0.0032822757
## EGFLAM 7 0.049359636 6.470041e-04 0.0003021148 0.0022194039
## crude_trend_test_p
## FAM171B 2.151509e-18
## EPHB2 3.220092e-01
## FOXM1 3.023015e-08
## TMEM134 7.480053e-01
## IQCA1 4.919816e-03
## GALNT12 4.486884e-01
## TIMELESS 0.000000e+00
## DBI 1.775266e-01
## TMEM139 NA
## CCDC27 2.848629e-11
## MYO1D 4.295755e-02
## ERCC6 8.627454e-03
## TROAP 2.090565e-03
## ECT2L 5.266030e-04
## ECM2 2.574508e-01
## DUOX1 6.170625e-01
## OR1S2 NA
## SH3RF2 NA
## DPP4 1.435035e-01
## MDM1 7.335189e-05
## ECM1 7.964537e-01
## REV3L 7.679104e-22
## PDE5A NA
## LRRIQ3 5.237513e-31
## SLC4A3 8.998370e-08
## NOC3L 8.458125e-01
## SLC4A4 7.893069e-02
## RP9 6.265937e-01
## MEF2A 3.068141e-01
## SLC28A2 2.843605e-07
## GBGT1 9.492073e-01
## C8A 7.882713e-01
## C6orf201 NA
## TMCO4 9.543098e-01
## ITIH6 7.863809e-05
## ADAMTSL4 7.219679e-109
## RNF135 NA
## SCN5A 6.236184e-19
## RDX 4.536315e-07
## SERPIND1 8.375230e-03
## HOXD13 1.128662e-01
## METTL20 5.545568e-01
## SLC9A3R1 5.295468e-01
## ENTPD3 7.947945e-02
## LRIT3 2.364205e-02
## MRPL51 4.104889e-14
## SCRN3 NA
## CENPJ 4.273692e-01
## DEPTOR 1.276417e-11
## C7orf31 3.861723e-98
## WDR63 NA
## KRT12 7.902473e-01
## IGKC 7.949823e-01
## ITGA9 1.425096e-23
## RALGAPB 3.905950e-01
## C19orf59 8.627859e-01
## ZNF44 7.290773e-01
## DXO 5.890828e-01
## TCF24 3.585680e-01
## GYG1 2.583234e-03
## PLEKHG3 7.809027e-03
## STK11IP 1.673924e-01
## FARS2 6.613277e-01
## OR52K1 2.644906e-02
## NRG1 7.777956e-26
## GALT 6.895987e-01
## CYP2A13 9.656618e-03
## SVOPL 0.000000e+00
## GPR87 4.326746e-11
## NF1 3.295114e-11
## PPAT 1.060213e-02
## P2RX2 1.784944e-01
## RAX2 1.218281e-01
## ARAP1 6.051181e-12
## S100Z NA
## TIMM21 3.560414e-06
## SH2B2 2.272441e-38
## IQCE 3.451536e-01
## GPR133 5.092634e-01
## ACOX3 1.127528e-15
## DCLRE1B 2.094227e-11
## CACNA1H 1.811436e-01
## UNC5A 6.958668e-01
## EIF4ENIF1 NA
## CC2D1B 9.907819e-09
## APLP2 1.041983e-06
## ALG6 9.931371e-01
## FREM2 4.899878e-04
## OR4X2 9.299060e-14
## OR5D18 8.334446e-01
## MYO9B 1.658507e-03
## BIN3 3.608443e-04
## EGFLAM 6.818402e-03
# Explore selected genes
dim(selected_genes.df) # 93 x 18
## [1] 93 18
summary(selected_genes.df$inverted)
## Mode FALSE TRUE NA's
## logical 90 3 0
summary(selected_genes.df$multiallelic)
## Mode FALSE TRUE NA's
## logical 90 3 0
hist(selected_genes.df$num_var,
labels=TRUE, xlim=c(0,12), xlab="num of variants", ylim=c(0,60), ylab="genes count",
main="Histogram of numbers of variants in selected genes")
hist(selected_genes.df$aggregated_MAC,
labels=TRUE, xlab="aggregated MAC", ylim=c(0,100), ylab="genes count",
main="Histogram of aggregated MAC in selected genes")
hist(selected_genes.df$aggregated_MAC[selected_genes.df$aggregated_MAC < 50],
labels=TRUE, breaks = 46,
xlim=c(0,50), xlab="aggregated MAC", ylim=c(0,25), ylab="genes count",
main="Histogram of aggregated MAC in selected genes (zoom < 50)")
hist(selected_genes.df$af_exac_NFE,
labels=TRUE, xlim=c(0,0.05), xlab="aggregated exac AF", ylim=c(0,80), ylab="genes count",
main="Histogram of aggregated exac AF in selected genes")
# Write result table to text file
results_file <- paste(base_folder, "results", "r10c_SKAT_M_and_crude_trends.txt", sep="/")
write.table(selected_genes.df, file=results_file, quote=FALSE, sep="\t")
# Clean-up
rm(results_file)
genes <- rownames(selected_genes_with_exac.df)
length(genes) # 83
## [1] 83
for(gene in genes){
#gene <- "FOXM1"
x <- as.numeric(selected_genes_with_exac.df[gene, c("af_exac_NFE", "af_ubc", "af_cbc"), drop=TRUE])
exac_counts <- (selected_genes_with_exac.df[gene, c("ac_exac_NFE", "an_exac_NFE"), drop=TRUE])
exac_counts <- paste(exac_counts, collapse = "/")
exac_counts <- paste(round(x[1],5), " (", exac_counts, ")", sep="")
ubc_counts <- (selected_genes_with_exac.df[gene, c("ac_ubc", "an_ubc"), drop=TRUE])
ubc_counts <- paste(ubc_counts, collapse = "/")
ubc_counts <- paste(round(x[2],5), " (", ubc_counts, ")", sep="")
cbc_counts <- (selected_genes_with_exac.df[gene, c("ac_cbc", "an_cbc"), drop=TRUE])
cbc_counts <- paste(cbc_counts, collapse = "/")
cbc_counts <- paste(round(x[3],5), " (", cbc_counts, ")", sep="")
counts <- c(exac_counts, ubc_counts, cbc_counts)
p <- barplot(x, names=c("af_exac_NFE", "af_ubc", "af_cbc"),
ylim=c(0, 1.1*max(x)), ylab="aggregated AF",
main=paste(gene,"\ncrude counts"))
text(p, x, labels=counts, pos=3, offset=.5)
}
rm(gene, genes, x, p, exac_counts, ubc_counts, cbc_counts, counts, selected_genes_with_exac.df)
genes <- rownames(selected_genes_no_exac.df)
length(genes) # 10
## [1] 10
for(gene in genes){
#gene <- "TMEM139"
x <- as.numeric(selected_genes_no_exac.df[gene, c("af_ubc", "af_cbc"), drop=TRUE])
ubc_counts <- (selected_genes_no_exac.df[gene, c("ac_ubc", "an_ubc"), drop=TRUE])
ubc_counts <- paste(ubc_counts, collapse = "/")
ubc_counts <- paste(round(x[1],5), " (", ubc_counts, ")", sep="")
cbc_counts <- (selected_genes_no_exac.df[gene, c("ac_cbc", "an_cbc"), drop=TRUE])
cbc_counts <- paste(cbc_counts, collapse = "/")
cbc_counts <- paste(round(x[2],5), " (", cbc_counts, ")", sep="")
counts <- c(ubc_counts, cbc_counts)
p <- barplot(x, names=c("af_ubc", "af_cbc"),
ylim=c(0, 1.1*max(x)), ylab="aggregated AF",
main=paste(gene,"\ncrude counts"))
text(p, x, labels=counts, pos=3, offset=.5)
}
rm(gene, genes, x, p, ubc_counts, cbc_counts, counts, selected_genes_no_exac.df)
Assuming that candidates have exac data
candidates <- c("FOXM1", "SLC9A3R1", "ACACA",
"ATM", "CHEK2", "FANCB",
"EPHB2", "TIMELESS", "ERCC6", "REV3L", "PDK4", "HDAC6",
"TLR5", "IGKC", "THBS4", "EID3", "AKAP13", "NRG1", "PLK3", "INCENP", "NF1", "CHRNA9")
genes_aggr_skat_M_exac.df[candidates, c("af_exac_NFE", "af_ubc", "af_cbc", "trend_call", "burden_svt_p")]
## af_exac_NFE af_ubc af_cbc trend_call burden_svt_p
## FOXM1 3.119452e-03 0.0046001415 0.009238729 risk 0.005854754
## SLC9A3R1 1.773017e-03 0.0004201681 0.003094607 risk 0.029060352
## ACACA 1.840130e-05 0.0005122951 0.001628664 risk 0.231834803
## ATM 6.219315e-04 0.0014744913 0.002162496 risk 0.265714946
## CHEK2 4.028686e-04 0.0014611338 0.002770936 risk 0.135053181
## FANCB 8.716822e-02 0.0857740586 0.111842105 risk 0.193727742
## EPHB2 3.075506e-03 0.0000000000 0.006802721 risk 0.004438192
## TIMELESS 2.624044e-04 0.0700541191 0.064362714 risk 0.010587495
## ERCC6 7.560670e-04 0.0003465003 0.002539913 risk 0.013369314
## REV3L 6.782671e-05 0.0000000000 0.002207506 risk 0.018944101
## PDK4 6.978547e-03 0.0126582278 0.000000000 protective 0.020845472
## HDAC6 3.441264e-03 0.0081632653 0.000000000 protective 0.077788594
## TLR5 7.060521e-02 0.0761904762 0.059798271 protective 0.033865983
## IGKC 1.238896e-03 0.0000000000 0.002164502 risk 0.034368191
## THBS4 1.624306e-03 0.0051546392 0.000000000 protective 0.042211267
## EID3 1.880256e-03 0.0051020408 0.000000000 protective 0.042893414
## AKAP13 2.515029e-04 0.0025614754 0.000000000 protective 0.043316528
## NRG1 1.841688e-05 0.0000000000 0.003432494 risk 0.041964650
## PLK3 1.478524e-04 0.0000000000 0.002380952 risk 0.050357879
## INCENP 3.171559e-03 0.0061728395 0.000000000 protective 0.040107295
## NF1 5.520389e-05 0.0000000000 0.002164502 risk 0.042363028
## CHRNA9 6.198220e-04 0.0035014006 0.000000000 protective 0.044378320
for(gene in candidates){
#gene <- "FOXM1"
#gene <- "TMEM139" # no exac data
# Skip gene if it does not have exac data
if(is.na(genes_aggr_skat_M_exac.df[gene,"af_exac_NFE"])) {
print(paste(gene, "has no exac data"))
next
}
x <- as.numeric(genes_aggr_skat_M_exac.df[gene, c("af_exac_NFE", "af_ubc", "af_cbc"), drop=TRUE])
exac_counts <- (genes_aggr_skat_M_exac.df[gene, c("ac_exac_NFE", "an_exac_NFE"), drop=TRUE])
exac_counts <- paste(exac_counts, collapse = "/")
exac_counts <- paste(round(x[1],5), " (", exac_counts, ")", sep="")
ubc_counts <- (genes_aggr_skat_M_exac.df[gene, c("ac_ubc", "an_ubc"), drop=TRUE])
ubc_counts <- paste(ubc_counts, collapse = "/")
ubc_counts <- paste(round(x[2],5), " (", ubc_counts, ")", sep="")
cbc_counts <- (genes_aggr_skat_M_exac.df[gene, c("ac_cbc", "an_cbc"), drop=TRUE])
cbc_counts <- paste(cbc_counts, collapse = "/")
cbc_counts <- paste(round(x[3],5), " (", cbc_counts, ")", sep="")
counts <- c(exac_counts, ubc_counts, cbc_counts)
p <- barplot(x, names=c("af_exac_NFE", "af_ubc", "af_cbc"),
ylim=c(0, 1.1*max(x)), ylab="aggregated AF",
main=paste(gene,"\ncrude counts"))
text(p, x, labels=counts, pos=3, offset=.5)
}
rm(gene, candidates, x, p, exac_counts, ubc_counts, cbc_counts, counts)
dim(genotypes.mx)
## [1] 18551 478
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000026 0 NA 0 0 0
## Var000000029 0 0 0 0 0
## Var000000031 0 0 0 0 0
## Var000000053 0 0 0 0 NA
## Var000000064 0 0 0 0 NA
dim(genotypes_inv_imp_wt.mx)
## [1] 18551 478
class(genotypes_inv_imp_wt.mx)
## [1] "matrix"
genotypes_inv_imp_wt.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000026 0 0.05482456 0 0 0.00000000
## Var000000029 0 0.00000000 0 0 0.00000000
## Var000000031 0 0.00000000 0 0 0.00000000
## Var000000053 0 0.00000000 0 0 0.05707763
## Var000000064 0 0.00000000 0 0 0.05518764
dim(genes_aggr_data.mx)
## [1] 9109 478
class(genes_aggr_data.mx)
## [1] "matrix"
genes_aggr_data.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## NOC2L 0 0.05482456 0.00000000 0.000000 0.0000000
## KLHL17 0 0.00000000 0.00000000 0.000000 0.1122653
## AGRN 0 0.65431621 0.00000000 8.602334 0.0000000
## TTLL10 0 0.00000000 0.00000000 0.000000 0.0000000
## PUSL1 0 0.05821459 0.05821459 0.000000 0.0000000
dim(genes_aggr_skat_M.df)
## [1] 9109 12
str(genes_aggr_skat_M.df)
## 'data.frame': 9109 obs. of 12 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : int 3 2 3 1 1 1 1 1 2 1 ...
## $ svt_p : num NA NA NA 0.346 0.711 ...
## $ svt_is_accurate : logi NA NA NA TRUE TRUE TRUE ...
## $ svt_map : num NA NA NA 0.0912 0.2106 ...
## $ burden_p : num 0.533 0.343 0.431 NA NA ...
## $ burden_is_accurate: logi TRUE TRUE TRUE NA NA NA ...
## $ burden_map : num 0.0135 0.0871 -1 NA NA ...
## $ skat_p : num 0.465 0.831 0.344 NA NA ...
## $ skat_is_accurate : logi TRUE TRUE TRUE NA NA NA ...
## $ skat_map : num 0.0118 0.0871 -1 NA NA ...
## $ aggregated_MAC : num 4 2 36 2 1 1 1 41 3 1 ...
genes_aggr_skat_M.df[1:5,1:5]
## gene num_var svt_p svt_is_accurate svt_map
## NOC2L NOC2L 3 NA NA NA
## KLHL17 KLHL17 2 NA NA NA
## AGRN AGRN 3 NA NA NA
## TTLL10 TTLL10 1 0.3459301 TRUE 0.09120194
## PUSL1 PUSL1 1 0.7106340 TRUE 0.21063399
dim(genes_aggr_skat_M_exac.df)
## [1] 9109 18
str(genes_aggr_skat_M_exac.df)
## 'data.frame': 9109 obs. of 18 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : int 3 2 3 1 1 1 1 1 2 1 ...
## $ aggregated_MAC : num 4 2 36 2 1 1 1 41 3 1 ...
## $ burden_svt_p : num 0.533 0.343 0.431 0.346 0.711 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_exac_NFE : num 14 2 2193 1 32 ...
## $ an_exac_NFE : num 108598 108618 74336 4760 54314 ...
## $ af_exac_NFE : num 1.29e-04 1.84e-05 2.95e-02 2.10e-04 5.89e-04 ...
## $ ac_ubc : num 1 0 21 0 1 0 0 21 1 1 ...
## $ an_ubc : num 1450 912 1432 490 410 ...
## $ af_ubc : num 0.00069 0 0.01466 0 0.00244 ...
## $ ac_cbc : num 3 2 15 2 0 1 1 20 2 0 ...
## $ an_cbc : num 1358 870 1360 466 400 ...
## $ af_cbc : num 0.00221 0.0023 0.01103 0.00429 0 ...
## $ pearson_r : num 0.966 0.863 -0.944 0.844 -0.231 ...
## $ trend_call : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
## $ crude_trend_test_p: num 1.28e-09 6.31e-23 4.40e-07 1.49e-03 8.00e-01 ...
genes_aggr_skat_M_exac.df[1:5,1:5]
## gene num_var aggregated_MAC burden_svt_p inverted
## NOC2L NOC2L 3 4 0.5333523 FALSE
## KLHL17 KLHL17 2 2 0.3433343 FALSE
## AGRN AGRN 3 36 0.4308774 FALSE
## TTLL10 TTLL10 1 2 0.3459301 FALSE
## PUSL1 PUSL1 1 1 0.7106340 FALSE
dim(selected_genes.df)
## [1] 93 18
str(selected_genes.df)
## 'data.frame': 93 obs. of 18 variables:
## $ gene : chr "FAM171B" "EPHB2" "FOXM1" "TMEM134" ...
## $ num_var : int 4 2 6 1 2 5 7 3 1 4 ...
## $ aggregated_MAC : num 19 6 38 5 5 12 436 5 5 5 ...
## $ burden_svt_p : num 0.00137 0.00444 0.00585 0.00759 0.00847 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_exac_NFE : num 31 334 492 357 118 686 57 224 NA 17 ...
## $ an_exac_NFE : num 104836 108600 157720 54034 99566 ...
## $ af_exac_NFE : num 0.000296 0.003076 0.003119 0.006607 0.001185 ...
## $ ac_ubc : num 18 0 13 0 0 2 233 0 5 0 ...
## $ an_ubc : num 1856 950 2826 470 974 ...
## $ af_ubc : num 0.0097 0 0.0046 0 0 ...
## $ ac_cbc : num 1 6 25 5 5 10 203 5 0 5 ...
## $ an_cbc : num 1776 882 2706 432 914 ...
## $ af_cbc : num 0.000563 0.006803 0.009239 0.011574 0.00547 ...
## $ pearson_r : num 0.025 0.547 0.958 0.428 0.745 ...
## $ trend_call : Factor w/ 3 levels "protective","risk",..: 2 2 2 2 2 2 2 2 NA 2 ...
## $ crude_trend_test_p: num 2.15e-18 3.22e-01 3.02e-08 7.48e-01 4.92e-03 ...
selected_genes.df[1:5,1:5]
## gene num_var aggregated_MAC burden_svt_p inverted
## FAM171B FAM171B 4 19 0.001368650 FALSE
## EPHB2 EPHB2 2 6 0.004438192 FALSE
## FOXM1 FOXM1 6 38 0.005854754 FALSE
## TMEM134 TMEM134 1 5 0.007589152 FALSE
## IQCA1 IQCA1 2 5 0.008472387 FALSE
dim(genes_aggr_info.df)
## [1] 9109 27
str(genes_aggr_info.df)
## 'data.frame': 9109 obs. of 27 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : num 3 2 3 1 1 1 1 1 2 1 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ aggr_ac : num 4 2 36 2 1 1 1 41 3 1 ...
## $ aggr_an : num 2808 1782 2792 956 810 ...
## $ aggr_af : num 0.00142 0.00112 0.01289 0.00209 0.00123 ...
## $ aggr_ac_cbc : num 3 2 15 2 0 1 1 20 2 0 ...
## $ aggr_an_cbc : num 1358 870 1360 466 400 ...
## $ aggr_af_cbc : num 0.00221 0.0023 0.01103 0.00429 0 ...
## $ aggr_ac_ubc : num 1 0 21 0 1 0 0 21 1 1 ...
## $ aggr_an_ubc : num 1450 912 1432 490 410 ...
## $ aggr_af_ubc : num 0.00069 0 0.01466 0 0.00244 ...
## $ aggr_ac_cbc_fam : num 1 0 5 2 0 0 1 9 0 0 ...
## $ aggr_an_cbc_fam : num 482 306 478 166 144 162 164 164 318 150 ...
## $ aggr_af_cbc_fam : num 0.00207 0 0.01046 0.01205 0 ...
## $ aggr_ac_cbc_nofam: num 2 2 10 0 0 1 0 11 2 0 ...
## $ aggr_an_cbc_nofam: num 876 564 882 300 256 298 296 300 566 290 ...
## $ aggr_af_cbc_nofam: num 0.00228 0.00355 0.01134 0 0 ...
## $ aggr_ac_ubc_fam : num 0 0 4 0 1 0 0 8 0 1 ...
## $ aggr_an_ubc_fam : num 418 262 414 142 120 140 142 142 278 132 ...
## $ aggr_af_ubc_fam : num 0 0 0.00966 0 0.00833 ...
## $ aggr_ac_ubc_nofam: num 1 0 17 0 0 0 0 13 1 0 ...
## $ aggr_an_ubc_nofam: num 1032 650 1018 348 290 ...
## $ aggr_af_ubc_nofam: num 0.000969 0 0.016699 0 0 ...
## $ cbc_ubc_call : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
## $ cbc_ubc_fisher_p : num 0.359 0.238 0.408 0.237 1 ...
genes_aggr_info.df[1:5,1:5]
## gene num_var inverted multiallelic aggr_ac
## NOC2L NOC2L 3 FALSE FALSE 4
## KLHL17 KLHL17 2 FALSE FALSE 2
## AGRN AGRN 3 FALSE FALSE 36
## TTLL10 TTLL10 1 FALSE FALSE 2
## PUSL1 PUSL1 1 FALSE FALSE 1
dim(genes_aggr_kgen.df)
## [1] 9109 16
str(genes_aggr_kgen.df)
## 'data.frame': 9109 obs. of 16 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : num 3 2 3 1 1 1 1 1 2 1 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_kgen : num 2 NA 92 NA 6 NA 4 371 9 NA ...
## $ an_kgen : num 10016 NA 5008 NA 5008 ...
## $ af_kgen : num 0.0002 NA 0.0184 NA 0.0012 ...
## $ ac_ubc : num 1 0 21 0 1 0 0 21 1 1 ...
## $ an_ubc : num 1450 912 1432 490 410 ...
## $ af_ubc : num 0.00069 0 0.01466 0 0.00244 ...
## $ ac_cbc : num 3 2 15 2 0 1 1 20 2 0 ...
## $ an_cbc : num 1358 870 1360 466 400 ...
## $ af_cbc : num 0.00221 0.0023 0.01103 0.00429 0 ...
## $ pearson_r : num 0.959 NA -1 NA -0.491 ...
## $ trend_call : Factor w/ 3 levels "protective","risk",..: 2 NA 1 NA 1 NA 2 1 2 NA ...
## $ crude_trend_test_p: num 0.00168 NA 0.04823 NA 0.75384 ...
genes_aggr_kgen.df[1:5,1:5]
## gene num_var inverted multiallelic ac_kgen
## NOC2L NOC2L 3 FALSE FALSE 2
## KLHL17 KLHL17 2 FALSE FALSE NA
## AGRN AGRN 3 FALSE FALSE 92
## TTLL10 TTLL10 1 FALSE FALSE NA
## PUSL1 PUSL1 1 FALSE FALSE 6
dim(genes_aggr_exac.df)
## [1] 9109 16
str(genes_aggr_exac.df)
## 'data.frame': 9109 obs. of 16 variables:
## $ gene : chr "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
## $ num_var : num 3 2 3 1 1 1 1 1 2 1 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_exac_NFE : num 14 2 2193 1 32 ...
## $ an_exac_NFE : num 108598 108618 74336 4760 54314 ...
## $ af_exac_NFE : num 1.29e-04 1.84e-05 2.95e-02 2.10e-04 5.89e-04 ...
## $ ac_ubc : num 1 0 21 0 1 0 0 21 1 1 ...
## $ an_ubc : num 1450 912 1432 490 410 ...
## $ af_ubc : num 0.00069 0 0.01466 0 0.00244 ...
## $ ac_cbc : num 3 2 15 2 0 1 1 20 2 0 ...
## $ an_cbc : num 1358 870 1360 466 400 ...
## $ af_cbc : num 0.00221 0.0023 0.01103 0.00429 0 ...
## $ pearson_r : num 0.966 0.863 -0.944 0.844 -0.231 ...
## $ trend_call : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
## $ crude_trend_test_p: num 1.28e-09 6.31e-23 4.40e-07 1.49e-03 8.00e-01 ...
genes_aggr_exac.df[1:5,1:5]
## gene num_var inverted multiallelic ac_exac_NFE
## NOC2L NOC2L 3 FALSE FALSE 14
## KLHL17 KLHL17 2 FALSE FALSE 2
## AGRN AGRN 3 FALSE FALSE 2193
## TTLL10 TTLL10 1 FALSE FALSE 1
## PUSL1 PUSL1 1 FALSE FALSE 32
dim(kgen.df)
## [1] 18551 9
colnames(kgen.df)
## [1] "SplitVarID" "kgen.AC" "kgen.AN" "kgen.AF" "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
## SplitVarID kgen.AC kgen.AN kgen.AF kgen.AFR_AF
## Var000000026 Var000000026 NA NA NA NA
## Var000000029 Var000000029 1 5008 0.000199681 0
## Var000000031 Var000000031 1 5008 0.000199681 0
## Var000000053 Var000000053 NA NA NA NA
## Var000000064 Var000000064 NA NA NA NA
dim(exac.df)
## [1] 18551 48
colnames(exac.df)
## [1] "SplitVarID" "exac_non_TCGA.AF"
## [3] "exac_non_TCGA.AC" "exac_non_TCGA.AN"
## [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
## [7] "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AN_MALE"
## [9] "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AN_Adj"
## [11] "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_Het"
## [13] "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_AFR"
## [15] "exac_non_TCGA.AN_AFR" "exac_non_TCGA.Hom_AFR"
## [17] "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Hemi_AFR"
## [19] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AN_AMR"
## [21] "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Het_AMR"
## [23] "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.AC_EAS"
## [25] "exac_non_TCGA.AN_EAS" "exac_non_TCGA.Hom_EAS"
## [27] "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Hemi_EAS"
## [29] "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AN_FIN"
## [31] "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Het_FIN"
## [33] "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.AC_NFE"
## [35] "exac_non_TCGA.AN_NFE" "exac_non_TCGA.Hom_NFE"
## [37] "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Hemi_NFE"
## [39] "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AN_SAS"
## [41] "exac_non_TCGA.Hom_SAS" "exac_non_TCGA.Het_SAS"
## [43] "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.AC_OTH"
## [45] "exac_non_TCGA.AN_OTH" "exac_non_TCGA.Hom_OTH"
## [47] "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
## SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000026 Var000000026 NA NA
## Var000000029 Var000000029 1.506e-04 16
## Var000000031 Var000000031 2.354e-04 25
## Var000000053 Var000000053 1.883e-05 2
## Var000000064 Var000000064 9.416e-06 1
## exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000026 NA NA
## Var000000029 106210 8
## Var000000031 106210 4
## Var000000053 106210 0
## Var000000064 106206 0
dim(variants.df)
## [1] 18551 67
str(variants.df)
## 'data.frame': 18551 obs. of 67 variables:
## $ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 26 29 31 53 64 167 227 237 300 400 ...
## $ SYMBOL : Factor w/ 19862 levels "A1BG","A1CF",..: 11149 11149 11149 8665 8665 540 540 540 18233 13545 ...
## $ TYPE : Factor w/ 2 levels "INDEL","SNP": 2 2 2 2 2 2 2 2 2 2 ...
## $ CHROM : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ POS : int 881871 883883 883946 897287 897792 979748 987173 989899 1132513 1246373 ...
## $ REF : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 1575 539 539 539 1 1575 539 539 1044 ...
## $ ALT : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 1 323 989 989 989 989 664 989 989 1 ...
## $ ac_raw : int 1 2 1 1 1 53 1 1 2 1 ...
## $ af_raw : num 0.000711 0.001555 0.000774 0.000711 0.000722 ...
## $ an_raw : int 1406 1286 1292 1406 1386 1416 1420 1414 1320 1400 ...
## $ Consequence : Factor w/ 78 levels "3_prime_UTR_variant",..: 66 28 28 28 66 28 28 28 28 28 ...
## $ SIFT_call : Factor w/ 4 levels "deleterious",..: NA 1 1 1 NA 1 1 1 1 1 ...
## $ SIFT_score : num NA 0 0.01 0.04 NA 0.01 0 0.02 0.02 0.05 ...
## $ PolyPhen_call : Factor w/ 4 levels "benign","possibly_damaging",..: NA 3 3 3 NA 3 3 3 3 3 ...
## $ PolyPhen_score : num NA 0.936 0.946 0.997 NA 0.932 0.999 0.991 0.999 0.988 ...
## $ CLIN_SIG : Factor w/ 120 levels "association",..: NA NA NA NA NA 2 NA NA NA NA ...
## $ cDNA_position : Factor w/ 16585 levels "1","?-1","10",..: 4302 3741 3476 13013 15367 5539 11506 11969 4210 15212 ...
## $ CDS_position : Factor w/ 15333 levels "1","?-1","10",..: 3744 3218 2977 10599 13061 4871 10484 10919 3279 13242 ...
## $ Codons : Factor w/ 3014 levels "-/A","-/AA","AAA/-",..: 1025 1858 1306 1172 1300 1864 2503 374 328 1877 ...
## $ Protein_position : Factor w/ 7360 levels "1","?-1","10",..: 6008 5707 5569 1723 2739 6606 1672 1845 5745 2811 ...
## $ Amino_acids : Factor w/ 1851 levels "-/*","*","A",..: 1171 334 1303 1134 1283 381 1809 1594 1578 225 ...
## $ Existing_variation: Factor w/ 309385 levels "","FANCA:c.2534T>C",..: NA 54731 225315 291348 274420 10329 268628 74468 292466 49490 ...
## $ Multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_all : int 1 2 1 1 1 34 1 1 2 1 ...
## $ an_all : num 912 942 954 876 906 894 954 944 956 810 ...
## $ af_all : num 0.0011 0.00212 0.00105 0.00114 0.0011 ...
## $ ac_ubc : int 0 0 1 0 0 20 0 1 0 1 ...
## $ an_ubc : num 472 488 490 450 462 460 488 484 490 410 ...
## $ af_ubc : num 0 0 0.00204 0 0 ...
## $ ac_cbc : int 1 2 0 1 1 14 1 0 2 0 ...
## $ an_cbc : num 440 454 464 426 444 434 466 460 466 400 ...
## $ af_cbc : num 0.00227 0.00441 0 0.00235 0.00225 ...
## $ ac_ubc_fam : int 0 0 0 0 0 4 0 0 0 1 ...
## $ an_ubc_fam : num 134 142 142 130 132 132 142 140 142 120 ...
## $ af_ubc_fam : num 0 0 0 0 0 ...
## $ ac_ubc_nofam : int 0 0 1 0 0 16 0 1 0 0 ...
## $ an_ubc_nofam : num 338 346 348 320 330 328 346 344 348 290 ...
## $ af_ubc_nofam : num 0 0 0.00287 0 0 ...
## $ ac_cbc_fam : int 0 1 0 0 0 5 0 0 2 0 ...
## $ an_cbc_fam : num 156 162 164 152 154 148 166 164 166 144 ...
## $ af_cbc_fam : num 0 0.00617 0 0 0 ...
## $ ac_cbc_nofam : int 1 1 0 1 1 9 1 0 0 0 ...
## $ an_cbc_nofam : num 284 292 300 274 290 286 300 296 300 256 ...
## $ af_cbc_nofam : num 0.00352 0.00342 0 0.00365 0.00345 ...
## $ inverted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ac_inv : num 1 2 1 1 1 34 1 1 2 1 ...
## $ an_inv : num 912 942 954 876 906 894 954 944 956 810 ...
## $ af_inv : num 0.0011 0.00212 0.00105 0.00114 0.0011 ...
## $ ac_cbc_inv : num 1 2 0 1 1 14 1 0 2 0 ...
## $ an_cbc_inv : num 440 454 464 426 444 434 466 460 466 400 ...
## $ af_cbc_inv : num 0.00227 0.00441 0 0.00235 0.00225 ...
## $ ac_ubc_inv : num 0 0 1 0 0 20 0 1 0 1 ...
## $ an_ubc_inv : num 472 488 490 450 462 460 488 484 490 410 ...
## $ af_ubc_inv : num 0 0 0.00204 0 0 ...
## $ ac_cbc_fam_inv : num 0 1 0 0 0 5 0 0 2 0 ...
## $ an_cbc_fam_inv : num 156 162 164 152 154 148 166 164 166 144 ...
## $ af_cbc_fam_inv : num 0 0.00617 0 0 0 ...
## $ ac_cbc_nofam_inv : num 1 1 0 1 1 9 1 0 0 0 ...
## $ an_cbc_nofam_inv : num 284 292 300 274 290 286 300 296 300 256 ...
## $ af_cbc_nofam_inv : num 0.00352 0.00342 0 0.00365 0.00345 ...
## $ ac_ubc_fam_inv : num 0 0 0 0 0 4 0 0 0 1 ...
## $ an_ubc_fam_inv : num 134 142 142 130 132 132 142 140 142 120 ...
## $ af_ubc_fam_inv : num 0 0 0 0 0 ...
## $ ac_ubc_nofam_inv : num 0 0 1 0 0 16 0 1 0 0 ...
## $ an_ubc_nofam_inv : num 338 346 348 320 330 328 346 344 348 290 ...
## $ af_ubc_nofam_inv : num 0 0 0.00287 0 0 ...
## $ weight : num 25 25 23.8 25 25 ...
variants.df[1:5,1:5]
## SplitVarID SYMBOL TYPE CHROM POS
## Var000000026 Var000000026 NOC2L SNP 1 881871
## Var000000029 Var000000029 NOC2L SNP 1 883883
## Var000000031 Var000000031 NOC2L SNP 1 883946
## Var000000053 Var000000053 KLHL17 SNP 1 897287
## Var000000064 Var000000064 KLHL17 SNP 1 897792
dim(phenotypes.df)
## [1] 478 36
str(phenotypes.df)
## 'data.frame': 478 obs. of 36 variables:
## $ wes_id : chr "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## $ gwas_id : chr "id203568" "id357807" "id325472" "id304354" ...
## $ merged_id : chr "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
## $ filter : chr "pass" "pass" "pass" "pass" ...
## $ cc : int 1 1 1 1 1 1 0 1 1 0 ...
## $ setno : int 203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
## $ registry : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
## $ family_history: int 1 0 1 1 1 1 1 1 1 0 ...
## $ age_dx : int 49 35 32 33 44 28 28 38 35 36 ...
## $ age_ref : num 58 36 41 34 49 28 32 44 35 38 ...
## $ rstime : num 10.17 1.83 9.75 1.59 5.66 ...
## $ eig1_gwas : num -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
## $ eig2_gwas : num 0.00266 0.00246 -0.0001 0.00595 0.01157 ...
## $ eig3_gwas : num 0.06803 0.05055 -0.00603 0.00747 0.00144 ...
## $ eig4_gwas : num -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
## $ eig5_gwas : num -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
## $ stage : num 1 2 2 1 1 1 2 1 2 1 ...
## $ er : num NA 0 0 0 NA 1 1 1 1 0 ...
## $ pr : num NA 0 0 NA NA 1 NA 1 0 0 ...
## $ hist_cat : chr "lobular" "ductal" "medullary" "ductal" ...
## $ hormone : num 0 0 0 0 1 0 0 0 0 0 ...
## $ chemo_cat : chr "no" "CMF" "Oth" "no" ...
## $ br_xray : int 1 1 0 0 1 0 0 0 1 1 ...
## $ br_xray_dose : num 1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
## $ age_menarche : num 9 13 10 12 10 13 12 11 11 NA ...
## $ age_1st_ftp : num 30 0 26 0 17 0 25 28 27 18 ...
## $ age_menopause : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ num_preg : num 1 0 1 0 1 0 1 1 1 1 ...
## $ bmi_age18 : num 20.8 22.5 23.6 18.6 19.9 ...
## $ bmi_dx : num 25.8 22.9 28.3 17.8 26.6 ...
## $ bmi_ref : num 33.3 22.9 33.1 17.8 29.8 ...
## $ eig1 : num -0.1566 -0.13939 -0.00225 -0.00914 0.01841 ...
## $ eig2 : num 0.08741 0.06847 -0.00239 0.0008 -0.02139 ...
## $ eig3 : num -0.0185 -0.0117 -0.0185 -0.0713 -0.0101 ...
## $ eig4 : num -0.03333 0.03332 0.01482 0.00719 -0.01141 ...
## $ eig5 : num 0.02116 0.09035 0.00663 0.01628 -0.02406 ...
phenotypes.df[1:5,1:5]
## wes_id gwas_id merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568 pass 1
## P1_A02 P1_A02 id357807 P1_A02_id357807 pass 1
## P1_A03 P1_A03 id325472 P1_A03_id325472 pass 1
## P1_A04 P1_A04 id304354 P1_A04_id304354 pass 1
## P1_A05 P1_A05 id222648 P1_A05_id222648 pass 1
# Check consistency of rownames and colnames
# Note that, in contrast to the source data, genes_aggr tables have already been syncronised
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(colnames(genes_aggr_data.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_data.mx))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_kgen.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_exac.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_skat_M.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_skat_M_exac.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(genotypes_inv_imp_wt.mx))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0
# Save data
save.image(paste(base_folder, "results", "r10c_SKAT_M_and_crude_trends.RData", sep="/"))
ls()
## [1] "base_folder" "exac.df"
## [3] "genes_aggr_data.mx" "genes_aggr_exac.df"
## [5] "genes_aggr_info.df" "genes_aggr_kgen.df"
## [7] "genes_aggr_skat_M.df" "genes_aggr_skat_M_exac.df"
## [9] "genotypes_inv_imp_wt.mx" "genotypes.mx"
## [11] "kgen.df" "phenotypes.df"
## [13] "selected_genes.df" "variants.df"
sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.15.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.0.5 magrittr_1.5 rprojroot_1.2 tools_3.3.3
## [5] htmltools_0.3.5 yaml_2.1.14 Rcpp_0.12.9 stringi_1.1.2
## [9] rmarkdown_1.3 stringr_1.2.0 digest_0.6.12 evaluate_0.10
Sys.time()
## [1] "2017-05-05 22:11:09 BST"